home *** CD-ROM | disk | FTP | other *** search
- /*
- PROGRAM NAME: select.c
-
- PURPOSE: 1) Compute Discrimination Value of terms,
- 2) Compute Poisson Distributions for terms,
- 3) Parition terms by their total within document frequencies,
- 4) Compute cohesion between pairs of terms,
- 5) Compute Dice's coefficient of similarity between two terms.
-
- INPUT FILES REQUIRED:
-
- 1) a direct file, sequences of:
-
- document# term weight
-
- (multiple entries for any document should be grouped together )
-
- 2) an inverted file, sequences of
-
- term document# weight
-
- (multiple entries for any term should be grouped together)
-
- NOTES: Filters such as stop lists and stemmers should be used before
- before running this program.
-
- PARAMETERS TO BE SET BY USER:
-
- 1) MAXWORD - maximum size of a term
- 2) MAXWDF - maximum value expected for the within document
- frequency for a term in the collecgtion.
- 3) LOW_THRESHOLD - threshold for LOW and MID frequency ranges
- 4) HIGH_THRESHOLD - threshold for MID and HIGH frequency ranges
-
- COMMAND LINE:
-
- select direct_file inverted_file output_file
-
- ****************************************************************************/
-
- #include <stdio.h>
- #include <string.h>
- #include <math.h>
- #define MAXWORD 20 /* maximum size of a term */
- #define MAXWDF 30 /* maximum WDF for a word in a database */
- #define LOW_THRESHOLD 2.0
- #define HIGH_THRESHOLD 4.0
-
- struct termlist { /* sequences of term and weight pairs */
- char term[MAXWORD]; /* term */
- float weight; /* term weight in document */
- struct termlist *nextterm;/* ptr. to next termlist record */
- } termlistfile;
-
- struct doclist { /* sequences of document # and weight pairs*/
- int doc; /* document number */
- float weight; /* term weight in document */
- struct doclist *nextdoc; /* ptr. to next doclist record */
- } doclistfile;
-
- struct direct { /* direct file: document to list of terms */
- int docnum; /* document # */
- struct termlist *terms; /* sequences of term and weight pairs */
- struct direct *next; /* ptr. to next direct record */
- } directfile;
-
- struct invert { /* inverted file: term to list of documnts */
- char term[MAXWORD]; /* term */
- struct doclist *doc; /* sequences of document # and weight pairs*/
- struct invert *next; /* ptr. to next invert record */
- } invfile;
-
- static struct invert *startinv; /* ptr. to first record in inverted file */
- static struct invert *lastinv; /* ptr. to last record in inverted file */
- static struct doclist *lastdoc; /* ptr. to last document in doclist */
- static struct direct *startdir; /* ptr. to first record in direct file */
- static struct direct *lastdir; /* ptr. to last record in direct file */
- static struct termlist *lastterm; /* ptr. to last term in termlist */
- static struct direct *start_centroid; /* ptr. to centroid record */
-
- static FILE *input; /* direct file */
- static FILE *input1; /* inverted file */
- static FILE *output; /* file to hold all output */
-
- static int currentdoc; /* tracks current document in direct file */
- static char currentterm[MAXWORD]; /* tracks current term in inverted file */
- static int Number_of_docs; /* total # of documents which is computed */
-
- static
- float av_doc_similarity(), /* compute average doc. similarity in dbse. */
- factorial(), /* compute factorial of a number */
- cosine(), /* compute cosine between two terms */
- dice(), /* compute dice beteen two terms */
- total_wdf(), /* compute total frequency of term in dbse. */
- cohesion(); /* compute cohesion between two terms */
- static
- void initialize(), /* initialize files and global variables */
- read_directfile(), /* read in the direct file */
- add_direct(), /* called within read_directfile() */
- pr_direct(), /* print the direct file */
- read_invfile(), /* read in the inverted file */
- add_invert(), /* called within read_invfile() */
- pr_invert(), /* print the inverted file */
- centroid(), /* compute the document centroid for dbse. */
- pr_centroid(), /* print the document centroid */
- get_Poisson_dist(), /* compute Poisson distributions for terms */
- Partition_terms(), /* partition terms by frequency */
- dv_all(), /* compute discrimination value of terms */
- get_doc_data(), /* get basic info. about documents */
- get_term_data(); /* get basic info. about terms */
-
- static struct direct *get_mem_direct(); /* these 4 get_mem functions are */
- static struct invert *get_mem_invert(); /* used to obtain memory for a */
- static struct doclist *get_mem_doclist(); /* record. The record type is */
- static struct termlist *get_mem_termlist(); /* obvious from the name */
-
- struct invert *find_term(); /* searches for term in inverted file*/
- /* and returns its address */
- int main(argc,argv)
- int argc;
- char *argv[];
- {
-
- char ch, word1[MAXWORD], word2[MAXWORD];
-
- if (argc!=4)
- {
- (void) printf("There is an error in the command line\n");
- (void) printf("Correct usage is\n");
- (void) printf("select direct_file inverted_file output_file\n");
- exit(1);
- }
-
- initialize(argv);
- (void) fprintf(output,"\nREADING IN DIRECT FILE\n");
- read_directfile();
- (void) fprintf(output,"\nPRINTING DIRECT FILE\n\n");
- pr_direct();
- (void) fprintf(output,"\nNUMBER OF DOCUMENTS IS: %d\n\n",Number_of_docs);
- (void) fprintf(output,"\nREADING IN INVERTED FILE\n");
- read_invfile();
- (void) fprintf(output,"\nPRINTING INVERTED FILE\n");
- pr_invert();
-
- (void) printf("\nPlease make a selection\n\n");
- (void) printf("To compute DV for all terms enter 1\n");
- (void) printf("To compute Poisson distributions enter 2\n");
- (void) printf("To partition terms by frequency enter 3\n");
- (void) printf("To compute cohesion between two terms (for phrase construction) enter 4\n");
- (void) printf("To compute Dice's coefficient between two terms enter 5\n");
- (void) printf("To quit enter 6\n\n");
- (void) printf("Enter your choice: ");
- ch = getchar();
- switch(ch)
- {
- case '1':
- centroid();
- (void) fprintf(output,"\nCENTROID\n\n");
- pr_centroid();
- (void) fprintf(output,"\nDISCRIMINATION VALUES FOR ALL TERMS\n\n");
- dv_all();
- break;
- case '2':
- (void) fprintf(output,"\nACTUAL AND POISSON DISTRIBUTIONS OF WITHIN DOCUMENT FREQUENCIES FOR ALL TERMS\n\n");
- (void) fprintf(output,"WDF = Within Document Frequency & #docs = Number of documents\n\n");
- get_Poisson_dist();
- break;
- case '3':
- (void) printf("Make sure that the threshold parameters are set correctly in the programm\n");
- (void) fprintf(output,"\nPARTITIONING THE TERMS INTO LOW, MEDIUM, HIGH FREQUENCY CLASSES\n\n");
- Partition_terms();
- break;
- case '4':
- (void) printf("enter first word: ");
- (void) scanf("%s",word1);
- if (find_term(word1) == NULL) {
- printf("sorry, %s is not in the inverted file\n",word1);
- break;
- }
- (void) printf("enter second word: ");
- (void) scanf("%s",word2);
- if (find_term(word2) == NULL) {
- printf("sorry, %s is not in the inverted file\n",word2);
- break;
- }
- (void) fprintf(output,"Cohesion between %s and %s is %f\n",word1,word2,cohesion(word1,word2));
- break;
- case '5':
- (void) printf("enter first word: ");
- (void) scanf("%s",word1);
- if (find_term(word1) == NULL) {
- printf("sorry, %s is not in the inverted file\n",word1);
- break;
- }
- (void) printf("enter second word: ");
- (void) scanf("%s",word2);
- if (find_term(word2) == NULL) {
- printf("sorry, %s is not in the inverted file\n",word2);
- break;
- }
- (void) fprintf(output,"Dice's coefficient between %s and %s is %f\n",word1,word2,dice(word1,word2));
- break;
- case '6':
- exit(0);
- default:
- (void) printf("no selection made\n");
- }
-
- (void) fclose(input1); (void) fclose(input); (void) fclose(output);
- return(0);
-
- }
- /****************************************************************************
-
- initialize(argv)
-
- Returns: void
-
- Purpose: Open all required files and initialize global variables
-
- **/
-
- static void initialize(argv)
- char *argv[]; /* in: holds the three parameters input at the command line */
- {
- if (( input = fopen(argv[1],"r")) == NULL ) {
- (void) printf("couldn't open file %s\n",argv[1]);
- exit(1); /* input direct file */
- }
- if (( input1 = fopen(argv[2],"r")) == NULL ) {
- (void) printf("couldn't open file %s\n",argv[2]);
- exit(1); /* input inverted file */
- }
- if (( output = fopen(argv[3],"w")) == NULL) {
- (void) printf("couldn't open file %s for output\n",argv[3]);
- exit(1); /* output file */
- }
- /* set initial values of global variables */
- startinv = NULL; lastinv = NULL; lastdoc = NULL;
- startdir = NULL; lastdir = NULL; lastterm = NULL;
- start_centroid = NULL;
- currentdoc = 0; currentterm[0] = '\0'; Number_of_docs = 0;
- }
-
- /****************************************************************************/
- /*
- read_directfile()
-
- Returns: void
-
- Purpose: Read in the direct file entries from the 1st input file
-
- **/
-
- static void read_directfile()
- {
- int docid; /* holds the current document number */
- char temp[MAXWORD]; /* holds the current term */
- float weight; /* holds the current term weight */
- struct termlist *p; /* structure to store the term-weight pair */
-
- (void) fscanf(input,"%d%s%f",&docid,temp,&weight); /* read the next line */
- while (docid > 0)
- /* while its found a legitimate document number */
- {
- if (docid == currentdoc) {
- /* if this document number has previously been entered in direct file */
- /* then only need to attach a termlist record to the same entry */
- p = get_mem_termlist(); /* get memory for a termlist record */
- (void) strncpy(p->term,temp,MAXWORD); /* copy the new word over */
- p->weight = weight; /* assign the new weight over */
- p->nextterm = NULL;
- if (lastterm) lastterm->nextterm = p; /* connect p to the termlist
- chain for this document */
- lastterm = p; /* set this global variable */
- }
- else { /* else docid represents a new document */
- Number_of_docs = Number_of_docs +1; /* increment global variable */
- add_direct(docid,temp,weight); /* starts a brand new entry in */
- /* the direct file */
- }
- docid = 0;
- (void) fscanf(input,"%d%s%f",&docid,temp,&weight);
- }
- }
- /***************************************************************************
-
- add_direct(docid,temp,weight)
-
- Returns: void
-
- Purpose: Start a new entry in the direct file. It is called in
- the read_directfile function when a new document number is
- read from the input file.
- **/
-
- static void add_direct(docid,temp,weight)
- int docid; /* in: new document number */
- char temp[MAXWORD]; /* in: index term */
- float weight; /* in: index term weight */
- {
- struct direct *p; /* structure p will be attached to direct file */
-
- p = get_mem_direct(); /* get memory for p */
- p->docnum = docid; /* assign the document number */
- p->terms = get_mem_termlist(); /* get memory for termlist structure */
- (void) strncpy(p->terms->term,temp,MAXWORD); /* assign index term to it */
- p->terms->weight = weight; /* assign term weight to it */
- p->terms->nextterm = NULL; /* current end of termlist */
- p->next = NULL; /* current end of direct file */
- if (startdir == NULL) startdir = p; /* if this is the very first document
- then global variable pointing to start of direct file should be updated */
- if (lastdir) lastdir->next = p; /* update pointer to last direct file rec. */
- lastdir = p;
- lastterm = p->terms; /* update pointer to last term */
- currentdoc = docid; /* update the global variable currentdoc to the */
- /* document number just added */
- }
- /***************************************************************************
- pr_direct()
-
- Returns: void
-
- Purpose: Print the direct file. It prints sequences of
- document# term weight.
-
- **/
-
- static void pr_direct()
- {
- struct direct *dir_addr; /* tracks address of current direct file record */
- struct termlist *term_addr; /* tracks address of current termlist record */
-
- dir_addr = startdir; /* start with beginning of direct file */
- while (dir_addr) { /* check for legitimate direct file record */
- (void) fprintf(output,"DOCUMENT NUMBER: %d \n",dir_addr->docnum);
- (void) fprintf(output," TERM TERM WEIGHT\n");
- term_addr = dir_addr->terms; /* get addr. of first term */
- while (term_addr) { /* loop through all the terms */
- (void) fprintf(output," %-30s ",term_addr->term);
- (void) fprintf(output,"%-10.3f\n",term_addr->weight);
- term_addr = term_addr->nextterm; /* go to next term for the doc. */
- }
- (void) fprintf(output,"\n");
- dir_addr = dir_addr->next; /* go to next direct file record */
- }
- }
- /***************************************************************************
-
- read_invfile()
-
- Returns: void
-
- Purpose: Read in the inverted file entries from 2nd input file
-
- **/
-
- static void read_invfile()
- {
- int docid; /* holds currenct document number */
- char temp[MAXWORD]; /* holds current term */
- float weight; /* holds current term weight */
- struct doclist *p; /* structure to store doc#-weight pair */
-
- (void) fscanf(input1,"%s%d%f",temp,&docid,&weight); /* read next line */
- while (strlen(temp) > 0)
- /* while its found a legitimate term */
- {
- if (!strncmp(currentterm,temp,MAXWORD)) {
- /* if this term has previously been entered in inverted file */
- /* then only need to attach a doclist record to same term entry */
- p = get_mem_doclist(); /* get memory for doclist record */
- p->doc = docid; /* assign document number */
- p->weight = weight; /* assign weight */
- p->nextdoc = NULL;
- if (lastdoc) lastdoc->nextdoc = p; /* connect p to the doclist
- chain for this term */
- lastdoc = p; /* set this global variable */
- }
- else add_invert(docid,temp,weight);
- /* else term is a brand new term & need to make a new inverted file entry */
- temp[0] = '\0';
- (void) fscanf(input1,"%s%d%f",temp,&docid,&weight); /* read next line */
- }
- }
- /***************************************************************************
-
- add_invert(docid,temp,weight);
-
- Returns: void
-
- Purpose: Start a new entry in the inverted file. It is called in the
- read_invfile function when a new term is read from the
- input file
-
- **/
-
- static void add_invert(docid,temp,weight)
- int docid; /* in: document number */
- char temp[MAXWORD]; /* in: new index term */
- float weight; /* in: index term weight */
- {
- struct invert *p; /* structure p will be attached to inverted file */
-
- p = get_mem_invert(); /* get memory for p */
- (void) strncpy(p->term,temp,MAXWORD); /* copy over the term */
- p->doc = get_mem_doclist(); /* start a doclist structure */
- p->doc->doc = docid; /* assign document number */
- p->doc->weight = weight; /* assign term weight */
- p->doc->nextdoc = NULL;
- p->next = NULL;
- if (startinv == NULL) startinv = p;
- /* if this is the first entry in inverted file, then update global var. */
- if (lastinv) lastinv->next = p; /* update ptr. to last inverted file record */
- lastinv = p;
- lastdoc = p->doc; /* update ptr. to last document */
- (void) strncpy(currentterm,temp,MAXWORD); /* update global var. currentterm to the */
- /* new term just entered */
- }
- /***************************************************************************
-
- pr_invert()
-
- Returns: void
-
- Purpose: Print the inverted file. It prints sequences of
- term document# weight.
-
- **/
-
- static void pr_invert()
- {
- struct invert *inv_addr; /* tracks address of current inverted file record */
- struct doclist *doc_addr; /* tracks address of current doclist record */
-
- inv_addr = startinv; /* start with beginning of inverted file */
- while (inv_addr) { /* check for legitimate inverted file record */
- (void) fprintf(output,"TERM: %s\n",inv_addr->term);
- (void) fprintf(output," DOCUMENT NUMBER TERM WEIGHT\n");
- doc_addr = inv_addr->doc; /* get addr. of first document */
- while (doc_addr) { /*loop through all the associated doc.#s and weights*/
- (void) fprintf(output," %-30d ",doc_addr->doc);
- (void) fprintf(output,"%-10.5f\n",doc_addr->weight);
- doc_addr = doc_addr->nextdoc; /* get addr. of next document */
- }
- (void) fprintf(output,"\n");
- inv_addr = inv_addr->next; /* go to next inverted file record */
- }
- }
- /***************************************************************************
-
- centroid()
-
- Returns: void
-
- Purpose: Compute and return the centroid for the documents of
- the database.
-
- Notes: Centroid is stored as a direct file record. Document
- number given to it is Number_of_docs + 1. The centroid is
- computed by determining for each index term in the inverted
- file, its average weight in the database. These average
- weights are then stored in the direct file entry for the
- centroid. (Note that these average weights are not used
- anywhere, but could be used in computing DV for terms).
- **/
-
- static void centroid()
- {
- struct invert *inv_addr; /* tracks address of current inverted file record */
- struct doclist *doc_addr; /* tracks address of current doclist record */
- float total_weight, /* tracks total weight for each term */
- av_term_weight; /* holds average term weight for each term */
- struct termlist *q, /* structure used to create centroid entry in
- the direct file */
- *lastterm; /* tracks the last term in the centroid */
-
- start_centroid = get_mem_direct(); /* centroid stored as direct file record */
- start_centroid->docnum = Number_of_docs +1; /* assign its pseudo doc.# */
- start_centroid->next = NULL; /* end of direct file chain */
- lastterm = NULL;
- inv_addr = startinv; /* begin at top of inverted file */
- while (inv_addr) { /* while there is a legitimate inv. file record... */
- doc_addr = inv_addr->doc; /* get address of first document */
- total_weight = 0.0; /* start with a 0 total weight for this term */
- while (doc_addr) { /* if this is a legitimate doc. addr. */
- total_weight = total_weight + doc_addr->weight;
- /* update total weight for term */
- doc_addr = doc_addr->nextdoc; /* loop through all docs. for the term */
- }
- av_term_weight = total_weight/Number_of_docs;
- /* calculating average term wt. */
- q = get_mem_termlist();
- (void) strncpy(q->term,inv_addr->term,MAXWORD);
- q->weight = av_term_weight;
- q->nextterm = NULL;
- if (lastterm == NULL) start_centroid->terms = q;
- /* if this is the first term entry for the centroid */
- else lastterm->nextterm = q;
- /* else connect this term to the centroid's termlist chain */
- lastterm = q;
- inv_addr = inv_addr->next; /* go on to the next inverted file entry */
- }
- }
- /***************************************************************************
-
- pr_centroid()
-
- Returns: void
-
- Purpose: Print the centroid from the direct file
-
- **/
-
- static void pr_centroid()
- {
- struct termlist *term_addr; /* tracks address of current termlist record */
-
- /* note the centroid is given a document number = Number_of_docs + 1 */
- /* therefore it may be treated as a special kind of document vector */
- if (start_centroid) {
- /* if there is a centroid */
- (void) fprintf(output,"---------------------------------------\n");
- (void) fprintf(output,"TERM WEIGHT \n");
- (void) fprintf(output,"---------------------------------------\n");
- term_addr = start_centroid->terms; /* get first term address */
- while (term_addr) { /* printing out term and weight pairs */
- (void) fprintf(output,"%-30s ",term_addr->term);
- (void) fprintf(output,"%-10.5f\n",term_addr->weight);
- term_addr = term_addr->nextterm; /* loop through all terms */
- }
- (void) fprintf(output,"\n");
- }
- }
- /***************************************************************************
-
- get_Poisson_dist()
-
- Returns: void
-
- Purpose: Get the Poisson distribution data for any term
-
- Notes: This function has two parts:
- PART I: Determine the actual within doc. freq. distribution
- PART II: Determine the distribution anticipated under the
- single Poisson model.
- It is assumed that the within document frequency of a term
- is stored as the term weight in the inverted file.
-
- **/
-
- static void get_Poisson_dist()
- {
- struct invert *inv_addr; /* tracks address of current inverted file record */
- struct doclist *doc_ad; /* tracks address of current doclist record */
- float dist[MAXWDF][2]; /* store for each term */
- /* column 1 = within document frequency (wdf) */
- /* column 2 = document frequency */
- int i, /* counter to add information to the dist array */
- j, /* counter to loop through dist array */
- found, /* flag used to match wdf in dist array */
- numdocs, /* counter to track # of docs. with the same wdf */
- docs_with_term; /* tracks the number of documents having the term */
- float first, /* these five local variables are */
- second, /* used to determine expected distribution */
- result,
- exponent,
- lambda; /* single Poisson parameter */
-
- inv_addr = startinv; /* start at the beginning of the inverted file */
-
- /* PART I: For each term determine the number of documents in the
- collection that have a particular wdf */
- while (inv_addr) { /* check for legitimate inv. file record */
- docs_with_term = 0;
- doc_ad = inv_addr->doc; /* get the first doc. address */
- i = 0; /* used to check if this is the very first entry in dist */
- while (doc_ad) {
- if (i == 0) { /* if first entry in dist */
- dist[i][0] = doc_ad->weight; dist[i][1] = 1;
- /* assign wdf and doc. frequency = 1 to first row in dist */
- i++;
- docs_with_term++;
- }
- else { /* dist already has other entries, hence look for
- any previous entries for the same wdf value */
- found = 0;
- for (j=0;j<i;j++) { /* loop through dist */
- if (dist[j][0] == doc_ad->weight) { /* if found the same wdf */
- dist[j][1] = dist[j][1] + 1; /* add 1 to the doc. frequency */
- found = 1;
- docs_with_term++;
- }
- } /* ending for */
- if (found == 0) { /* if not found the same wdf in dist */
- /* start new row in dist */
- dist[i][0] = doc_ad->weight; dist[i][1] = 1;
- i++;
- docs_with_term++;
- }
- } /* ending else */
- doc_ad = doc_ad->nextdoc;/* loop through other documents for same term */
- } /* ending while */
- /* ending if */
-
- /* now print out actual distribution information for this term */
- (void) fprintf(output,"\nTerm = %s\n",inv_addr->term);
- (void) fprintf(output,"\nActual Distribution: ");
- (void) fprintf(output," WDF #docs\n");
- (void) fprintf(output," 0 %d\n",Number_of_docs-docs_with_term);
- for (j=0;j<i;j++)
- (void) fprintf(output," %-16.0f %-6.0f\n",dist[j][0],dist[j][1]);
-
- /* PART II: */
- /* computing lambda - the only single Poisson parameter */
-
- (void) fprintf(output,"\nExpected Distribution: ");
- (void) fprintf(output," WDF #docs\n");
- /* call the function total_wdf to compute the total frequency of the term */
- lambda = (total_wdf(inv_addr->term))/Number_of_docs;
- first = exp(-lambda);
- numdocs = -1; j = -1;
- /* computing document frequency for each within document frequency value */
- while (numdocs != 0) {
- j = j + 1;
- exponent = j; /* type conversion necessary for pow function */
- if (j == 0) result = first * Number_of_docs;
- else {
- second = pow(lambda,exponent);
- result = (((first * second)/factorial(j)) * Number_of_docs);
- }
- if ((result - floor(result)) < 0.5) numdocs = floor(result);
- else numdocs = ceil(result);
- (void) fprintf (output," %-16d %-6d\n",j,numdocs);
- }
- inv_addr = inv_addr->next; /* continue with the next inverted file term */
- } /* end while */
- }
-
- /***************************************************************************
-
- factorial(n)
-
- Returns: float
-
- Purpose: Return the factorial of a number. Used in get_poisson_dist
-
- **/
-
- static float factorial(n)
- int n; /* in: compute factorial for this parameter */
- {
- float answer; /* holds the result */
-
- if (n==1) return(1.0);
- answer = factorial(n-1)*n;
- return(answer);
- }
-
- /***************************************************************************
-
- total_wdf(term)
-
- Returns: float
-
- Purpose: Compute total frequency in the database for a specified
- term using the inverted file.
-
- Notes: It is assumed that the appropriate term frequency is stored
- as the term weight in the inverted file. This routine can
- also be used to filter out the low and high frequency terms.
- The resulting mid frequency terms can be used as input to
- the program which generates hierarchies.
-
- **/
-
- static float total_wdf(term)
- char term[MAXWORD]; /* in: term for which total frequency is to be found */
- {
- struct invert *inv_addr; /* tracks current inverted file record */
- struct doclist *doc_addr; /* tracks current doclist record */
- float total; /* tracks the total frequency */
-
- total = 0.0; /* initial value */
- /* function find_term will find out where the term is in the inverted file */
- inv_addr = find_term(term);
- if (inv_addr) { /* if this term was found in the inverted file */
- doc_addr = inv_addr->doc; /* get first associated document address */
- while (doc_addr) { /* update the total frequency */
- total = total + doc_addr->weight;
- doc_addr = doc_addr->nextdoc; /* loop through other associated docs. */
- }
- }
- return(total);
- }
- /***************************************************************************
-
- Partition_terms()
-
- Returns: void
-
- Purpose: Assign each term in the inverted file to one class:
- HIGH, MEDIUM or LOW frequency depending upon its total
- frequency in the collection. This function utilizes
- two parameters defined at the top of the program:
- LOW_THRESHOLD and HIGH_THRESHOLD, which should be set
- by the user.
-
- **/
-
- static void Partition_terms()
- {
- struct invert *inv_addr; /* tracks address of current inverted file record */
- float total; /* holds total frequency of each term */
-
- inv_addr = startinv; /* start at the beginning of the inverted file */
- (void) fprintf(output,"\nTerm - Total Frequency - Frequency Class\n\n");
-
- while (inv_addr) { /* if a legitimate address */
- /* compute total frequency for term in collection */
- total = total_wdf(inv_addr->term);
- (void) fprintf(output,"\n%s - %f -",inv_addr->term,total);
- if (total < LOW_THRESHOLD) (void) fprintf(output," LOW\n");
- else if (total > HIGH_THRESHOLD) (void) fprintf(output," HIGH\n");
- else (void) fprintf(output," MEDIUM\n");
- inv_addr = inv_addr->next; /* continue with next inverted file entry */
- }
- }
-
- /***************************************************************************
-
- cohesion(term1,term2)
-
- Returns: float
-
- Purpose: Compute the cohesion between two terms
-
- **/
-
- static float cohesion(term1,term2)
- char term1[MAXWORD], /* in: the two terms which are being */
- term2[MAXWORD]; /* in: compared to determine cohesion */
- {
- float l1, /* holds # of documents associated with term1 */
- l2, /* holds # of documents associated with term2 */
- common; /* holds # of documents in common */
-
- get_term_data(term1,term2,&l1,&l2,&common);
- return(common/(sqrt(l1 * l2)));
- }
-
- /***************************************************************************
-
- dv_all()
-
- Returns: void
-
- Purpose: Compute Discrimination Value (DV) for all terms in the
- database
-
- Notes: Similarity between two documents as calculated here is a
- function of the number of terms in common and the number
- of terms in each. Term weights are not involved
-
- **/
-
- static void dv_all()
- {
- struct invert *inv_addr; /* tracks address of current inv. file record */
- float DV, /* holds computed DV */
- baseline; /* holds baseline similarity */
-
- /* first compute baseline similarity */
-
- baseline = av_doc_similarity("-"); /* the dummy term '-' is used for this */
- (void) fprintf(output,"-----------------------------------------\n");
- (void) fprintf(output,"TERM DV\n");
- (void) fprintf(output,"-----------------------------------------\n");
- inv_addr = startinv; /* begin at top of inverted file */
- while (inv_addr) { /* if legitimate inverted file record */
- DV = av_doc_similarity(inv_addr->term) - baseline;
- (void) fprintf(output,"%-30s %-10.5f \n",inv_addr->term,DV);
- inv_addr = inv_addr->next; /* go to next inverted file record */
- }
- }
-
- /***************************************************************************
-
- av_doc_similarity(term)
-
- Returns: float
-
- Purpose: Compute average similarity between each document
- and the centroid of the database. The word specified in
- term is ignored during computations.
-
- **/
-
- static float av_doc_similarity(term)
- char term[MAXWORD]; /* in: term is ignored during computations */
- {
- struct direct *dir_addr; /* tracks current direct file record */
- float dl1, /* holds # of terms in document */
- dl2, /* holds # of terms in centroid */
- common, /* holds # of terms in common between them */
- total_sim; /* holds total similarity between them */
-
- total_sim = 0.0;
- dir_addr = startdir; /* begin with first direct file record */
- while (dir_addr) {
- /* get_doc_data returns #of terms in each document and #of terms in common */
- get_doc_data(dir_addr,start_centroid,&dl1,&dl2,&common,term);
- total_sim = total_sim + cosine(dl1,dl2,common);
- dir_addr = dir_addr->next; /* go to next direct file record */
- }
- return(total_sim/Number_of_docs);
- }
-
- /***************************************************************************
-
- dice(term1,term2)
-
- Returns: float
-
- Purpose: Returns Dice's coefficient of similarity between
- any two documents
-
- **/
-
- static float dice(term1,term2)
- char term1[MAXWORD], /* in: the two terms that are being compared */
- term2[MAXWORD]; /* in: */
- {
- float l1, l2, common;
-
- get_term_data(term1,term2,&l1,&l2,&common);
-
- if (l1 == 0 || l2 == 0) return(0.0);
- return(common/(l1 + l2));
- }
-
- /***************************************************************************
-
- cosine(l1,l2,common)
-
- Returns: float
-
- Purpose: Returns cosine similarity between two documents
-
- **/
-
- static float cosine(l1,l2,common)
- float l1, /* in: # of terms associated with document 1 */
- l2, /* in: # of terms associated with document 2 */
- common; /* in: # of terms in common between them */
- {
- float temp;
-
- if (l1 == 0 || l2 == 0) return(0.0);
- temp = sqrt(l1 * l2);
- return(common/temp);
- }
-
- /***************************************************************************
-
- get_doc_data(p,q,l1,l2,common,index)
-
- Returns: void
-
- Purpose: Given two document numbers, it determines the number of
- of index terms in each and in common. It will exclude the index
- term (specified as the last parameter) from consideration. Used
- in av_doc_similarity for DV calculations.
-
- **/
-
- static void get_doc_data(p,q,l1,l2,common,index)
- char index[MAXWORD]; /* in: term to be excluded from computations */
- struct direct *p, *q; /* in: addresses of two documents numbers */
- float *l1, *l2; /* out: number of terms in each document */
- float *common; /* out: number of terms in common */
- {
- struct termlist *term_addr1, /* holds address of first docs. termlist */
- *term_addr2; /* holds address of second docs. termlist */
- int count1, /* number of terms in first doc. */
- count2, /* number of terms in second doc. */
- com; /* number of terms in common */
-
- term_addr1 = p->terms;
- count1 = 0; count2 = 0; com = 0;
- /* first find out number of terms in document 1 & # of common terms */
- while (term_addr1) {
- if (strncmp(term_addr1->term,index,MAXWORD)) count1 = count1 + 1;
- /* if its not the term to exclude */
- term_addr2 = q->terms;
- while (term_addr2) {
- if (!strncmp(term_addr1->term,term_addr2->term,MAXWORD)) {
- /* if the two terms are the same */
- if (strncmp(term_addr1->term,index,MAXWORD)) com = com + 1;
- /* if they do not match the term to exclude */
- break;
- }
- term_addr2 = term_addr2->nextterm;
- }
- term_addr1 = term_addr1->nextterm;
- }
- /* now find out number of terms in document 2 */
- term_addr2 = q->terms;
- while (term_addr2) {
- if (strncmp(term_addr2->term,index,MAXWORD)) count2 = count2 + 1;
- term_addr2 = term_addr2->nextterm;
- }
- *l1 = count1; *l2 = count2; *common = com;
- }
-
- /***************************************************************************
-
- get_term_data(term1,term2,l1,l2,common)
-
- Returns: void
-
- Purpose: Get info regarding number of documents in common
- between any two terms.
-
- **/
-
- static void get_term_data(term1,term2,l1,l2,common)
- char term1[MAXWORD], /* in: term 1 to be compared with */
- term2[MAXWORD]; /* in: term 2 */
- float *l1, /* out: # of documents associated with 1st term */
- *l2, /* out: # of documents associated with 2nd term */
- *common; /* out: # of documents in common between them */
- {
- struct invert *p, /* holds address of first term in inverted file */
- *q; /* holds address of second term in inv. file */
- struct doclist *doc_ad1,/* tracks doclist for first term */
- *doc_ad2;/* tracks doclist for second term */
- int count1, /* holds # of documents associated with 1st term */
- count2, /* holds # of documents associated with 2nd term */
- com; /* holds # of documents common between them */
-
- /* find addresses of both terms in the inverted file */
- p = find_term(term1); q = find_term(term2);
- doc_ad1 = p->doc; /* obtain 1st terms doclist address */
- count1 = 0; count2 = 0; com = 0;
-
- /* first get # of documents indexed by term 1 & # of docs. in common */
- while (doc_ad1) {
- count1 = count1 +1;
- doc_ad2 = q->doc;
- while (doc_ad2) {
- if (doc_ad1->doc == doc_ad2->doc) {
- /* if the document numbers are the same */
- com = com +1;
- break;
- }
- doc_ad2 = doc_ad2->nextdoc;
- }
- doc_ad1 = doc_ad1->nextdoc;
- }
-
- /* now get # of documents indexed by term 2 */
- doc_ad2 = q->doc;
- while (doc_ad2) {
- count2 = count2 + 1;
- doc_ad2 = doc_ad2->nextdoc;
- }
- *l1 = count1; *l2 = count2; *common = com;
- }
-
- /***************************************************************************
-
- *find_term(term)
-
- Returns: address of a struct invert record
-
- Purpose: search for a specified term in the inverted file &
- return address of the record
-
- **/
-
- struct invert *find_term(term)
- char term[MAXWORD]; /* in: term to be located in inverted file */
- {
- struct invert *inv_addr; /* tracks addr. of current rec. in inv. file */
- inv_addr = startinv; /* begin at top of inv. file */
-
- while(inv_addr) {
- if (!strcmp(term,inv_addr->term)) {return(inv_addr);}
- inv_addr = inv_addr->next;
- }
- (void) fprintf(output,"Findterm routine: Term %s not found\n",term);
- return(NULL);
- }
-
- /***************************************************************************
-
- *get_mem_direct()
-
- Returns: address of a struct direct record
-
- Purpose: dynamically obtain enough memory to store 1 direct record
-
- **/
-
- static struct direct *get_mem_direct()
- {
- struct direct *record;
-
- record = (struct direct *)malloc(sizeof(directfile));
- if (!record) {
- (void) fprintf(output,"\nout of memory\n");
- exit(0);
- }
- return(record);
- }
-
- /***************************************************************************
-
- *get_mem_termlist()
-
- Returns: address of a struct termlist record
-
- Purpose: dynamically obtain enough memory to store one termlist record
-
- **/
-
- static struct termlist *get_mem_termlist()
- {
- struct termlist *record;
-
- record = (struct termlist *)malloc(sizeof(termlistfile));
- if (!record) {
- (void) fprintf(output,"\nout of memory\n");
- exit(0);
- }
- return(record);
- }
-
- /***************************************************************************
-
- *get_mem_invert()
-
- Returns: address of a struct invert record
-
- Purpose: dynamically obtain enough memory to store one inverted
- file record
-
- **/
-
- static struct invert *get_mem_invert()
- {
- struct invert *record;
-
- record = (struct invert *)malloc(sizeof(invfile));
- if (!record) {
- (void) fprintf(output,"\nout of memory\n");
- exit(0);
- }
- return(record);
- }
-
- /***************************************************************************
-
- *get_mem_doclist()
-
- Returns: address of a struct doclist record
-
- Purpose: dynamically obtain enough memory to store one doclist
- record
-
- **/
-
- static struct doclist *get_mem_doclist()
- {
- struct doclist *record;
-
- record = (struct doclist *)malloc(sizeof(doclistfile));
- if (!record) {
- (void) (void) fprintf(output,"\nout of memory\n");
- exit(0);
- }
- return(record);
- }
-
-